<!DOCTYPE html>
<!-- lang tells assistive technology and translation tools which language the page is written in. -->
<html lang="en">
<!-- The charset declaration comes first so it is seen before any text that has to be decoded. -->
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>Tokenizers - nearley.js - JS Parsing Toolkit</title>
<meta name="description" content="nearley.js is a simple, fast, and powerful parser toolkit for JavaScript.">
<style>
/* Page-wide typography; the body itself carries no outer spacing. */
body {
    margin: 0;
    padding: 0;
    font-family: BlinkMacSystemFont, -apple-system, Helvetica Neue, Open Sans, Ubuntu, Arial, Helvetica, sans-serif;
    font-size: 15pt;
    font-weight: 300;
}

/* Links in the content column look the same whether or not they were visited. */
#main a,
#main a:visited {
    color: #55a;
}

#main a:hover {
    color: #259;
}

/* Site title: small, widely tracked and centred. */
h1 {
    margin: 0;
    padding: 1em;
    font-size: 1.2em;
    font-weight: normal;
    letter-spacing: 0.2em;
    text-align: center;
}

h1 a {
    text-decoration: none;
    color: #559;
}

h1 a:hover {
    cursor: pointer;
    color: orange;
}

/* Round orange badge; line-height equals height so its single line of text is vertically centred. */
#version {
    display: inline-block;
    width: 3em;
    height: 3em;
    line-height: 3em;
    border-radius: 50%;
    background: orange;
    color: white;
    font-weight: bold;
    text-align: center;
    letter-spacing: 0;
}

/* Section headings get generous space above them. */
h2 {
    margin-top: 2em;
    font-size: 1.5em;
}

h3,
h4 {
    margin-top: 3em;
}

/* Horizontal gutters plus auto side margins; centres the element once its width is capped. */
.center {
    padding-left: 2em;
    padding-right: 2em;
    margin-left: auto;
    margin-right: auto;
}
/* Caps the width of the content column. */
#main {
    max-width: 35em;
}

/*
 * NOTE(review): only `p` is scoped to #main here. The comma starts a new
 * selector, so `ul` and `ol` match lists anywhere on the page, including
 * the ones inside <nav>. Confirm whether `#main ul, #main ol` was intended
 * before changing it -- the nav currently picks up this line-height.
 */
#main p, ul, ol {
    line-height: 1.5em;
    margin-bottom: 1em;
}

/* Vertical breathing room between list items in the content column. */
#main li {
    margin-top: 1em;
    margin-bottom: 1em;
}

/* Unordered lists drop the native bullet... */
#main ul li {
    list-style: none;
    min-width: 10em;
}
/* ...and get an em dash (U+2014) pulled into the left gutter instead. */
#main ul li:before {
    display: inline-block;
    content: "\2014";
    margin-left: -1.5em;
    width: 1.5em;
    position: absolute;
}

/* Full-width, fixed-height monospace editing area with only a left rule as its border. */
#main textarea {
    width: 100%;
    height: 20em;
    margin-top: 5px;
    margin-bottom: 5px;
    padding: 5px;
    resize: none;

    font-family: menlo, monaco, monospace;
    font-size: 10pt;

    /* The shorthand clears all four sides first; the left side is then re-enabled. */
    border: none;
    border-left: 1px solid #aaa;
}

#main textarea:focus {
    outline: none;
}

/* Images are centred blocks that never overflow the content column. */
#main img {
    display: block;
    max-width: 100%;
    margin-left: auto;
    margin-right: auto;
}

/* Small grey caption placed directly after an image. */
img + .caption {
    color: #aaa;
    font-size: 0.7em;
    text-align: center;
}

/* Small, muted footer pushed well below the content. */
#footer {
    margin-top: 10em;
    padding: 5em;
    color: #aaa;
    font-size: 0.7em;
}

/* Inline code: light grey pill with a little space either side. */
code {
    margin-left: 0.2em;
    margin-right: 0.2em;
    padding: 0.2em;
    background-color: #eee;
    border-radius: 0.2em;
}

/* Code inside <pre> becomes a block that scrolls instead of overflowing. */
pre code {
    display: block;
    padding: 0.5em;
    overflow: auto;
    border-radius: 0.5em;
}


/* Ordered lists replace the native numbering with a CSS counter. */
ol {
    position: relative;
    counter-reset: tutorial-counter;
}

ol li {
    list-style: none;
}

/* Each item's number is drawn as a white digit on a round badge, positioned against the list. */
ol li:before {
    counter-increment: tutorial-counter;
    content: counter(tutorial-counter);

    position: absolute;
    left: 1em;
    margin-left: -2em;

    display: inline-block;
    width: 2em;
    height: 2em;
    line-height: 2em;
    border-radius: 50%;
    background-color: #559;
    color: white;
    font-weight: bold;
    text-align: center;
}

/* Borderless monospace text inputs. */
input[type="text"] {
    border: none;
    color: #559;
    font-family: monospace;
    font-size: 1em;
}

input[type="text"]:focus {
    outline: none;
}


/* Navigation: a bar with a bottom rule by default (the wide-screen media query turns it into a sidebar). */
nav {
    border-bottom: 2px solid #559;
    background: #fafafa;
}
nav ul {
    padding: 0;
    margin: 0;
}
/* Links fill their row; text that does not fit is cut off with an ellipsis. */
nav ul a {
    text-decoration: none;
    color: #334;
    display: block;
    overflow: hidden;
    text-overflow: ellipsis;
    padding: 0.25em 0.5em;
}
/* Top-level nav entries and their per-heading sub-entries both render as plain blocks. */
.page,
.page-heading {
    display: block;
}

/* Sub-entries are indented under their page. */
.page-heading {
    padding-left: 1.5em;
}

/* The current page title and the heading currently in view share one accent colour. */
.page-title-active,
.page-heading-active a {
    color: #66f;
}

/* Wide screens: pin the nav to the left edge as a scrollable sidebar and shift the docs page over. */
@media screen and (min-width: 36rem) {
    nav {
        position: fixed;
        top: 0;
        bottom: 0;
        left: 0;
        width: 16rem;
        overflow-y: auto;
        white-space: nowrap;
        font-size: 1rem;
        border-right: 2px solid #559;
        border-bottom: none;
    }
    body.docs-page {
        margin-left: 16rem;
    }
}

/*
 * highlight.js
 *
 * Minified colour theme for the `hljs`-prefixed classes that the highlighter
 * adds to code blocks. Kept on a single line exactly as shipped --
 * presumably vendored from a highlight.js stylesheet; confirm the upstream
 * source before editing it by hand.
 */

.hljs{display:block;overflow-x:auto;padding:0.5em;background:#F0F0F0}.hljs,.hljs-subst{color:#444}.hljs-comment{color:#888888}.hljs-keyword,.hljs-attribute,.hljs-selector-tag,.hljs-meta-keyword,.hljs-doctag,.hljs-name{font-weight:bold}.hljs-type,.hljs-string,.hljs-number,.hljs-selector-id,.hljs-selector-class,.hljs-quote,.hljs-template-tag,.hljs-deletion{color:#880000}.hljs-title,.hljs-section{color:#880000;font-weight:bold}.hljs-regexp,.hljs-symbol,.hljs-variable,.hljs-template-variable,.hljs-link,.hljs-selector-attr,.hljs-selector-pseudo{color:#BC6060}.hljs-literal{color:#78A960}.hljs-built_in,.hljs-bullet,.hljs-code,.hljs-addition{color:#397300}.hljs-meta{color:#1f7199}.hljs-meta-string{color:#4d99bf}.hljs-emphasis{font-style:italic}.hljs-strong{font-weight:bold}


</style>

<body class="docs-page">

<!-- Corner ribbon linking to the repository. The link holds only an icon, so it carries an aria-label as its accessible name and the decorative SVG is hidden from assistive technology. -->
<a href="https://github.com/kach/nearley" class="github-corner" aria-label="View source on GitHub"><svg width="80" height="80" viewBox="0 0 250 250" style="fill:#151513; color:#fff; position: absolute; top: 0; border: 0; right: 0;" aria-hidden="true"><path d="M0,0 L115,115 L130,115 L142,142 L250,250 L250,0 Z"></path><path d="M128.3,109.0 C113.8,99.7 119.0,89.6 119.0,89.6 C122.0,82.7 120.5,78.6 120.5,78.6 C119.2,72.0 123.4,76.3 123.4,76.3 C127.3,80.9 125.5,87.3 125.5,87.3 C122.9,97.6 130.6,101.9 134.4,103.2" fill="currentColor" style="transform-origin: 130px 106px;" class="octo-arm"></path><path d="M115.0,115.0 C114.9,115.1 118.7,116.5 119.8,115.4 L133.7,101.6 C136.9,99.2 139.9,98.4 142.2,98.6 C133.8,88.0 127.5,74.4 143.8,58.0 C148.5,53.4 154.0,51.2 159.7,51.0 C160.3,49.4 163.2,43.6 171.4,40.1 C171.4,40.1 176.1,42.5 178.8,56.2 C183.1,58.6 187.2,61.8 190.9,65.4 C194.5,69.0 197.7,73.2 200.1,77.6 C213.8,80.2 216.3,84.9 216.3,84.9 C212.7,93.1 206.9,96.0 205.4,96.6 C205.1,102.4 203.0,107.8 198.3,112.5 C181.9,128.9 168.3,122.5 157.7,114.1 C157.9,116.9 156.7,120.9 152.7,124.9 L141.0,136.5 C139.8,137.7 141.6,141.9 141.8,141.8 Z" fill="currentColor" class="octo-body"></path></svg></a><style>.github-corner:hover .octo-arm{animation:octocat-wave 560ms ease-in-out}@keyframes octocat-wave{0%,100%{transform:rotate(0)}20%,60%{transform:rotate(-25deg)}40%,80%{transform:rotate(10deg)}}@media (max-width:500px){.github-corner:hover .octo-arm{animation:none}.github-corner .octo-arm{animation:octocat-wave 560ms ease-in-out}}</style>
<!--
  Documentation navigation. The entry for the page being viewed carries
  aria-current="page" so the current location is exposed to assistive
  technology and not only through the accent colour. Its nested list holds
  one link per <h3> on the page; the scroll script further down matches
  those links by their "#id" href and toggles a class on the parent <li>.
-->
<nav id="nav">
    <h1><a href="/">nearley<span style="color: #559;">.js</span><span id="version">2.20.1</span></a></h1>
    <ul>
        <li class="page"><a href="/docs/index">Home</a></li>
        <li class="page"><a href="/docs/getting-started">Getting started</a></li>
        <li class="page"><a href="/docs/grammar">Writing a parser</a></li>
        <li class="page"><a href="/docs/parser">Using a parser</a></li>
        <li class="page">
            <a href="#" class="page-title-active" aria-current="page">Tokenizers</a>
            <ul>
                <li class="page-heading"><a href="#lexing-with-moo">Lexing with Moo</a></li>
                <li class="page-heading"><a href="#custom-lexers">Custom lexers</a></li>
                <li class="page-heading"><a href="#custom-token-matchers">Custom token matchers</a></li>
            </ul>
        </li>
        <li class="page"><a href="/docs/tooling">Tooling</a></li>
        <li class="page"><a href="/docs/how-to-grammar-good">How to grammar good</a></li>
        <li class="page"><a href="/docs/using-in-frontend">Compiling in browsers</a></li>
        <li class="page"><a href="/docs/glossary">Glossary</a></li>
    </ul>
</nav>

<div id="main" class="center">

<h2>Tokenizers</h2>
<p>By default, nearley splits the input into a stream of characters. This is
called <em>scannerless</em> parsing.</p>
<p>A tokenizer splits the input into a stream of larger units called <em>tokens</em>.
This happens in a separate stage before parsing. For example, a tokenizer might
convert <code>512 + 10</code> into <code>[&quot;512&quot;, &quot;+&quot;, &quot;10&quot;]</code>: notice how it removed the
whitespace and combined the digits of each number into a single token.</p>
<p>Using a tokenizer has many benefits. It…</p>
<ul>
<li>…often makes your parser faster by more than an order of magnitude.</li>
<li>…allows you to write cleaner, more maintainable grammars.</li>
<li>…helps avoid ambiguous grammars in some cases. For example, a tokenizer can
easily tell you that <code>superclass</code> is a single keyword, not a sequence of
<code>super</code> and <code>class</code> keywords.</li>
<li>…gives you <em>lexical information</em> such as line numbers for each token. This
lets you make better error messages.</li>
</ul>
<h3 id="lexing-with-moo">Lexing with Moo</h3>
<p>The <code>@lexer</code> directive instructs nearley to use a lexer you’ve defined inside a
<a href="grammar#additional-js">JavaScript block</a> in your grammar.</p>
<p>nearley supports and recommends <a href="https://github.com/tjvr/moo">Moo</a>, a
super-fast lexer. Construct a lexer using <code>moo.compile</code>.</p>
<p>When using a lexer, there are two ways to match tokens:</p>
<ul>
<li><p>Use <code>%token</code> to match a token with <strong>type</strong> <code>token</code>.</p>
<pre><code class="language-ne">line -&gt; words %newline</code></pre>
</li>
<li><p>Use <code>&quot;foo&quot;</code> to match a token with <strong>text</strong> <code>foo</code>.</p>
<p>This is convenient for matching keywords:</p>
<pre><code class="language-ne">ifStatement -&gt; &quot;if&quot; condition &quot;then&quot; block</code></pre>
</li>
</ul>
<p>Here is an example of a simple grammar:</p>
<pre><code class="language-ne">@{%
const moo = require(&quot;moo&quot;);

const lexer = moo.compile({
  ws:     /[ \t]+/,
  number: /[0-9]+/,
  word: { match: /[a-z]+/, type: moo.keywords({ times: &quot;x&quot; }) },
  times:  /\*/
});
%}

# Pass your lexer object using the @lexer option:
@lexer lexer

expr -&gt; multiplication {% id %} | trig {% id %}

# Use %token to match any token of that type instead of &quot;token&quot;:
multiplication -&gt; %number %ws %times %ws %number {% ([first, , , , second]) =&gt; first * second %}

# Literal strings now match tokens with that text:
trig -&gt; &quot;sin&quot; %ws %number {% ([, , x]) =&gt; Math.sin(x) %}</code></pre>
<p>Have a look at <a href="https://github.com/tjvr/moo#usage">the Moo documentation</a> to
learn more about writing a tokenizer.</p>
<p>You use the parser as usual: call <code>parser.feed(data)</code>, then read the
parsed results from <code>parser.results</code>.</p>
<h3 id="custom-lexers">Custom lexers</h3>
<p>nearley recommends using a <a href="https://github.com/tjvr/moo">moo</a>-based lexer.
However, you can use any lexer that conforms to the following interface:</p>
<ul>
<li><code>next()</code> returns a token object, which could have fields for line number,
etc. Importantly, a token object <em>must</em> have a <code>value</code> attribute.</li>
<li><code>save()</code> returns an info object that describes the current state of the
lexer. nearley places no restrictions on this object.</li>
<li><code>reset(chunk, info)</code> sets the internal buffer of the lexer to <code>chunk</code>, and
restores its state to a state returned by <code>save()</code>.</li>
<li><code>formatError(token)</code> returns a string with an error message describing a
parse error at that token (for example, the string might contain the line and
column where the error was found).</li>
<li><code>has(name)</code> returns true if the lexer can emit tokens with that name. This is
used to resolve <code>%</code>-specifiers in compiled nearley grammars.</li>
</ul>
<blockquote>
<p>Note: if you are searching for a lexer that allows indentation-aware
grammars (like in Python), you can still use moo. See <a href="https://gist.github.com/nathan/d8d1adea38a1ef3a6d6a06552da641aa">this
example</a> or
the
<a href="https://www.npmjs.com/package/moo-indentation-lexer">moo-indentation-lexer</a>
module.</p>
</blockquote>
<h3 id="custom-token-matchers">Custom token matchers</h3>
<p>Aside from the lexer infrastructure, nearley provides a lightweight way to
parse arbitrary streams.</p>
<p>Custom matchers can be defined in two ways: <em>literal</em> tokens and <em>testable</em>
tokens. A literal token matches a JS value exactly (with <code>===</code>), while a
testable token runs a predicate that tests whether or not the value matches.</p>
<p>Note that in this case, you would feed a <code>Parser</code> instance an <em>array</em> of
objects rather than a string! Here is a simple example:</p>
<pre><code class="language-ne">@{%
const tokenPrint = { literal: &quot;print&quot; };
const tokenNumber = { test: x =&gt; Number.isInteger(x) };
%}

main -&gt; %tokenPrint %tokenNumber &quot;;;&quot;

# parser.feed([&quot;print&quot;, 12, &quot;;&quot;, &quot;;&quot;]);</code></pre>


    <!--
      Page footer. It sits inside #main so it shares the content column's
      width and link/image styling. The paragraph and the #main wrapper are
      closed explicitly here instead of relying on the parser's end-of-file
      recovery, and the link that opens a new tab carries rel="noopener" so
      the opened page gets no handle on this window.
    -->
    <div id="footer"><p>nearley is MIT-licensed. It's maintained by <a
    href="https://hardmath123.github.io">Hardmath123</a> (<a
    href="https://github.com/Hardmath123">GitHub</a>). You're welcome to
    leave questions, comments, advice, or ideas as GitHub issues. And
    feel free to fork nearley with your own experiments!<br>

    <a href="https://js.org" target="_blank" rel="noopener" title="JS.ORG | JavaScript Community">
    <img src="https://logo.js.org/dark_horz.png" width="102" alt="JS.ORG Logo"></a></p>
</div>
</div><!-- /#main -->

<!--
  Syntax highlighting. The library is fetched asynchronously over an explicit
  https URL (a protocol-relative "//" URL resolves to file:// when the page
  is opened from disk and then fails to load). The inline script highlights
  every <pre><code> block once the library is available: immediately if
  `hljs` already exists, otherwise when the script element fires `load`.
-->
<script id="highlight-js" async src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/9.12.0/highlight.min.js"></script>
<script>
(function() {
    // Run highlight.js over every code block inside a <pre>.
    function highlightAll() {
        Array.prototype.forEach.call(document.querySelectorAll('pre code'), function(block) {
            hljs.highlightBlock(block)
        })
    }

    if (window.hljs) {
        // Library already executed, so its load event will not fire again.
        highlightAll()
    } else {
        document.querySelector('#highlight-js').addEventListener('load', highlightAll)
    }
})()
</script>


<script>
// Handle of the pending debounce timer, if any.
var timeout

/**
 * Debounced scroll/resize handler: every event cancels the pending timer and
 * starts a fresh one, so scrollEnd runs 50 ms after the last event in a burst.
 */
function onScroll(e) {
    if (timeout) {
        clearTimeout(timeout)
    }
    timeout = setTimeout(scrollEnd, 50)
}

// Both scrolling and resizing can change which heading is at the top.
;['scroll', 'resize'].forEach(function(type) {
    window.addEventListener(type, onScroll)
})

/**
 * Highlight the nav entry for the section currently being read.
 *
 * Walks the page's <h3> headings from last to first and picks the first one
 * whose top edge is above the viewport top plus a 60px allowance and that
 * has a matching `a[href="#<id>"]` link. That link's parent element gets the
 * `page-heading-active` class; any previously active entry is reset.
 *
 * Headings without an id or without a matching link are skipped rather than
 * throwing, and when no heading qualifies (e.g. scrolled back to the top)
 * the previous highlight is cleared instead of being left stale.
 */
function scrollEnd() {
  var headings = document.querySelectorAll('h3')
  var offset = 60
  var target = null
  for (var i = headings.length; i--; ) {
    var h3 = headings[i]
    if (h3.offsetTop < window.scrollY + offset) {
      // A heading may have no nav entry; fall back to the one before it.
      var link = h3.id ? document.querySelector('a[href="#' + h3.id + '"]') : null
      if (link) {
        target = link.parentNode
        break
      }
    }
  }

  var cur = document.querySelector('.page-heading-active')
  if (cur === target) return
  if (cur) cur.className = 'page-heading'
  if (target) target.className = 'page-heading page-heading-active'
}
</script>

<script>
// Google Analytics (analytics.js) loader, kept in its minified vendor form.
// It records the global name ('ga') on window, defines window.ga as a stub
// that queues its arguments in ga.q until the real library takes over,
// timestamps it in ga.l, then creates an async <script> element for the
// library URL and inserts it before the first <script> in the document.
(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
})(window,document,'script','https://www.google-analytics.com/analytics.js','ga');
// Queue a 'create' command with this site's property ID, then a 'send' for one page view.
ga('create', 'UA-46120535-5', 'auto');
ga('send', 'pageview');
</script>

